from copy import deepcopy
import gensim
import re
from gensim.models import Word2Vec
import random
# Accumulated tokenized sentences; filled by read_data(), consumed by train_model().
data = []

# Read a data file, split each line into sentences and word tokens, and
# append the token lists to the module-level `data` list.
def read_data(file):
    """Read *file* and extend the module-level ``data`` with tokenized sentences.

    Each line is split into sub-sentences on sentence-ending punctuation
    ([.!?"]); each sub-sentence is tokenized on commas and whitespace.
    Only non-empty token lists are appended (one list of words per sentence).

    Args:
        file: path to a UTF-8 text file.
    """
    with open(file, 'r', encoding='utf-8') as f:
        for line in f:
            # Split the line into sub-sentences on [.!?"].
            for sub_sentence in re.split(r'[.!?"]', line.strip()):
                # Tokenize on runs of commas/whitespace and drop the empty
                # strings that ", " and leading/trailing spaces would produce
                # (the original split polluted the vocabulary with '').
                tokens = [tok for tok in re.split(r'[,\s]+', sub_sentence) if tok]
                # Skip empty sentences so the model never sees [''] entries.
                if tokens:
                    data.append(tokens)


def train_model(data):
    """Train and return a CBOW Word2Vec model over the tokenized sentences.

    Args:
        data: iterable of token lists (one list of words per sentence).

    Returns:
        A trained ``gensim.models.Word2Vec`` instance.
    """
    # 50-dimensional vectors, context window of 5, keep every word
    # (min_count=1), 4 worker threads; sg=0 selects the CBOW architecture.
    return Word2Vec(data, vector_size=50, window=5, min_count=1, workers=4, sg=0)
    
# Generate text with a trained model: extend seed_words by num_words new words.
def generate_text(model, seed_words, num_words):
    """Return *seed_words* extended with *num_words* words sampled from *model*.

    At each step the context words are mapped to vectors, ``most_similar``
    proposes the top-5 candidate words, and one candidate is chosen uniformly
    at random (this is random sampling, not beam search, despite the original
    comments). The caller's *seed_words* list is left unmodified.

    NOTE(review): ``positive`` uses ``seed_words[:-2]`` while ``negative``
    uses ``seed_words[:-1]`` — the negative set contains the positive set,
    which looks suspicious; preserved as-is, confirm the intended slices.

    Args:
        model: trained Word2Vec model (uses ``model.wv``).
        seed_words: list of seed words; not mutated.
        num_words: number of words to generate.

    Returns:
        New list: the seed words followed by the generated words.
    """
    # Work on a copy so the caller's list is not mutated as a side effect
    # (the original appended to the argument in place).
    window = list(seed_words)
    # The full generated text: the seeds plus every sampled word.
    text = list(seed_words)
    for _ in range(num_words):
        # Vectors of the context words feeding the similarity query.
        pos_vector = model.wv[window[:-2]]
        neg_vector = model.wv[window[:-1]]
        # Top-5 candidate (word, score) pairs for the next word.
        similar_words = model.wv.most_similar(positive=pos_vector, negative=neg_vector, topn=5)
        # Sample one candidate uniformly at random.
        word = random.choice(similar_words)[0]
        text.append(word)
        # Slide the context window: append the new word, drop the oldest.
        window.append(word)
        window = window[1:]
    return text


def _main():
    """Build the corpus, train the model, and print a generated sentence."""
    # Populate the module-level `data` list from the corpus file.
    read_data('data.txt')
    # Fit the Word2Vec model on the tokenized sentences.
    model = train_model(data)
    # Start generation from a fixed three-word seed.
    seed_words = ['I', 'want', 'a']
    # Grow the text in three rounds of two words each.
    for _ in range(3):
        seed_words = generate_text(model, seed_words, 2)
    # Show the final generated text.
    print('\n', " ".join(seed_words), '\n')


if __name__ == '__main__':
    _main()